As a City of Melbourne council worker, I want to visualise and provide statistics on upcoming activities and planned works in entertainment and leisure, so that I can understand the impact on my local area.
I also want to know which entertainment locations are projected to be growth areas.
At the end of this use case you will understand which entertainment and leisure venues are in a small area, and whether the location is projected to be a growth area.
This means learning how to: load datasets from the City of Melbourne open data API, transform and merge them, map venues and planned works, and analyse pedestrian traffic to identify growth areas.
The City of Melbourne (COM) conducts a census of all local businesses every two years. The last published survey was in 2020, and the next survey results are expected soon.
The CLUE datasets contain information on venues:
- Bar, tavern, pub patron capacity
- Cafe, restaurant, bistro seats
Both are mapped to CLUE blocks via the CLUE blocks spatial layer.
#Libraries to be installed
#the -q flag reduces pip's output
!pip -q install seaborn
!pip -q install pandas
!pip -q install matplotlib
!pip -q install numpy
!pip -q install nbconvert
!pip -q install keyboard
!pip -q install geopandas
!pip -q install requests
!pip -q install folium
!pip -q install plotly #used for the interactive maps and charts below
!pip -q install statsmodels
!pip -q install tqdm
!pip -q install scikit-learn #used for the regression models at the end
#load libraries
import os
import io
import time
import keyboard
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import requests
import zipfile
import numpy as np
import pandas as pd
from urllib.request import urlopen
import json
from pandas import json_normalize
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
from IPython.display import display, HTML
import geopandas as gpd
import plotly.graph_objects as go
import plotly.express as px
from shapely.geometry import Polygon, shape, Point, box
from shapely.wkt import loads
from shapely.ops import unary_union
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
style.use('ggplot')
from pylab import rcParams
rcParams['figure.figsize'] = 8,4
#set default values
this_decade = (pd.Timestamp.today().year)-10
this_year = pd.Timestamp.today().year
y3 = (pd.Timestamp.today().year)-3
y2 = (pd.Timestamp.today().year)-2
y1 = (pd.Timestamp.today().year)-1
#Opendatasoft API endpoints (replacement for the retired Socrata API)
domain = 'https://data.melbourne.vic.gov.au/explore/dataset/'
baseurl = '/download/?format=json&timezone=Australia/Sydney&lang=en'
basegeourl='/download/?format=geojson&timezone=Australia/Sydney&lang=en'
#spatial layer used to map CLUE datasets to CLUE blocks
dsurl = 'blocks-for-census-of-land-use-and-employment-clue'
GeoJSONURL = domain + dsurl + basegeourl
#print(GeoJSONURL)
clueblocks = requests.get(GeoJSONURL).json()
#clueblocks["features"][0]
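The JSON downloads that follow all share the same pattern: request the export URL, then flatten each record's 'fields' dictionary with json_normalize. A small helper could reduce the repetition; this is only a sketch, and load_opendata_json is our own name, not part of the City of Melbourne API.
#Hypothetical helper (our own addition, not used below): download a dataset's JSON
#export and flatten the per-record 'fields' dictionaries into a DataFrame.
def load_opendata_json(dataset_id):
    url = domain + dataset_id + baseurl
    records = requests.get(url).json()
    return json_normalize(pd.DataFrame.from_dict(records)['fields'])
#example: df_btp_capacity = load_opendata_json('bars-and-pubs-with-patron-capacity')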
#Load Bar, tavern, pub patron capacity dataset
dsurl = 'bars-and-pubs-with-patron-capacity'
url = domain + dsurl + baseurl
#print(url)
data_json = requests.get(url).json()
data_json_df = pd.DataFrame.from_dict(data_json)
#this flattens the features
df_btp_capacity=json_normalize(data_json_df['fields'])
df_btp_capacity.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4402 entries, 0 to 4401
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   longitude          4382 non-null   float64
 1   census_year        4402 non-null   object
 2   building_address   4402 non-null   object
 3   trading_name       4402 non-null   object
 4   location           4382 non-null   object
 5   property_id        4402 non-null   int64
 6   business_address   4402 non-null   object
 7   latitude           4382 non-null   float64
 8   clue_small_area    4402 non-null   object
 9   block_id           4402 non-null   int64
 10  number_of_patrons  4402 non-null   int64
 11  base_property_id   4402 non-null   int64
dtypes: float64(2), int64(4), object(6)
memory usage: 412.8+ KB
#transform
integer_columns = ['census_year', 'block_id', 'property_id', 'base_property_id', 'number_of_patrons']
str_columns = ['building_address', 'clue_small_area', 'trading_name']
float_columns = ['longitude', 'latitude']
df_btp_capacity[integer_columns] = df_btp_capacity[integer_columns].astype(int)
df_btp_capacity[float_columns] = df_btp_capacity[float_columns].astype(float)
df_btp_capacity[str_columns] = df_btp_capacity[str_columns].astype(str)
#Add column with description Pubs, Taverns and Bars for grouping
df_btp_capacity['category'] = 'Pubs, Taverns and Bars'
#limit data to past decade
df_btp_capacity=df_btp_capacity.query("census_year >= @this_decade")
df_btp_capacity.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2659 entries, 569 to 4401
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   longitude          2645 non-null   float64
 1   census_year        2659 non-null   int32
 2   building_address   2659 non-null   object
 3   trading_name       2659 non-null   object
 4   location           2645 non-null   object
 5   property_id        2659 non-null   int32
 6   business_address   2659 non-null   object
 7   latitude           2645 non-null   float64
 8   clue_small_area    2659 non-null   object
 9   block_id           2659 non-null   int32
 10  number_of_patrons  2659 non-null   int32
 11  base_property_id   2659 non-null   int32
 12  category           2659 non-null   object
dtypes: float64(2), int32(5), object(6)
memory usage: 238.9+ KB
#Load Cafe, restaurant, bistro seats dataset
dsurl = 'cafes-and-restaurants-with-seating-capacity'
url = domain + dsurl + baseurl
#print(url)
data_json = requests.get(url).json()
data_json_df = pd.DataFrame.from_dict(data_json)
#this flattens the features
df_crb=json_normalize(data_json_df['fields'])
df_crb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56987 entries, 0 to 56986
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   location                      56460 non-null  object
 1   seating_type                  56987 non-null  object
 2   census_year                   56987 non-null  object
 3   property_id                   56987 non-null  int64
 4   base_property_id              56987 non-null  int64
 5   trading_name                  56987 non-null  object
 6   block_id                      56987 non-null  int64
 7   industry_anzsic4_description  56987 non-null  object
 8   number_of_seats               56987 non-null  int64
 9   building_address              56987 non-null  object
 10  clue_small_area               56987 non-null  object
 11  business_address              56987 non-null  object
 12  industry_anzsic4_code         56987 non-null  int64
 13  longitude                     56460 non-null  float64
 14  latitude                      56460 non-null  float64
dtypes: float64(2), int64(5), object(8)
memory usage: 6.5+ MB
#transform
integer_columns = ['census_year', 'block_id', 'property_id', 'base_property_id', 'number_of_seats']
str_columns = ['clue_small_area', 'trading_name','industry_anzsic4_description','seating_type']
df_crb[integer_columns] = df_crb[integer_columns].astype(int)
df_crb[str_columns] = df_crb[str_columns].astype(str)
#Add column with description for grouping
df_crb['category'] = 'Café, Restaurant, Bistro'
#drop rows with missing address or coordinates
df_crb = df_crb.dropna(subset=['business_address', 'longitude', 'latitude'])
#limit data to past decade
df_crb = df_crb.query("census_year >= @this_decade")
print(df_crb.shape)
df_crb.head(5).T
df_crb.info()
(33651, 16)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33651 entries, 0 to 56986
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   location                      33200 non-null  object
 1   seating_type                  33651 non-null  object
 2   census_year                   33651 non-null  int32
 3   property_id                   33651 non-null  int32
 4   base_property_id              33651 non-null  int32
 5   trading_name                  33651 non-null  object
 6   block_id                      33651 non-null  int32
 7   industry_anzsic4_description  33651 non-null  object
 8   number_of_seats               33651 non-null  int32
 9   building_address              33651 non-null  object
 10  clue_small_area               33651 non-null  object
 11  business_address              33651 non-null  object
 12  industry_anzsic4_code         33651 non-null  int64
 13  longitude                     33200 non-null  float64
 14  latitude                      33200 non-null  float64
 15  category                      33651 non-null  object
dtypes: float64(2), int32(5), int64(1), object(8)
memory usage: 3.7+ MB
#Combine the two venue datasets
clue_venues = pd.concat([df_crb, df_btp_capacity])
#combine data from venue datasets to use seats or patrons values as a capacity measure
#similar combination for category created
clue_venues['capacity'] = clue_venues[['number_of_seats', 'number_of_patrons']].bfill(axis=1).iloc[:, 0]
clue_venues['venue_description'] = clue_venues[['category', 'industry_anzsic4_description']].bfill(axis=1).iloc[:, 0]
#rename columns
clue_venues.rename(columns={
"latitude":"lat"
, "longitude":"lon"
},inplace = True)
#fill remaining nulls
clue_venues.fillna(0, inplace=True)
#clue_venues.head(3).T
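The bfill(axis=1) pattern above (reused later when combining the planned works data) coalesces two columns row by row, taking the first non-null value from left to right. A toy illustration, our own example:
#Toy example (our own addition) of coalescing columns with bfill(axis=1)
toy = pd.DataFrame({'number_of_seats': [40, None], 'number_of_patrons': [None, 120]})
toy['capacity'] = toy[['number_of_seats', 'number_of_patrons']].bfill(axis=1).iloc[:, 0]
print(toy) #capacity is 40.0 for the first row and 120.0 for the second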
#Load the city activities and planned works dataset
dsurl = 'city-activities-and-planned-works'
url = domain + dsurl + baseurl
#print(url)
data_json = requests.get(url).json()
data_json_df = pd.DataFrame.from_dict(data_json)
#this flattens the features
df_capw=json_normalize(data_json_df['fields'])
df_capw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   location                   603 non-null    object
 1   status                     605 non-null    object
 2   notes                      500 non-null    object
 3   end_date                   605 non-null    object
 4   geo_point_2d               605 non-null    object
 5   activity_id                605 non-null    object
 6   classification             605 non-null    object
 7   geometry                   605 non-null    object
 8   start_date                 605 non-null    object
 9   source_id                  605 non-null    object
 10  small_area                 605 non-null    object
 11  json_geometry.coordinates  605 non-null    object
 12  json_geometry.type         605 non-null    object
dtypes: object(13)
memory usage: 61.6+ KB
#drop records without geometry
df_capw = df_capw.dropna(subset=['geometry'])
#Convert to date, add columns
df_capw['start_dt'] = pd.to_datetime(df_capw.start_date).dt.date
df_capw['start_year'] = pd.to_datetime(df_capw.start_dt).dt.year
df_capw['start_month'] = pd.to_datetime(df_capw.start_dt).dt.month
#drop columns
df_capw = df_capw.drop(['json_geometry.type'], axis=1)
#filtering found records with end_date 2921-11-19 00:00:00; exclude these far-future dates
df_capw = df_capw.loc[(df_capw['end_date'] < '2065-01-01')]
df_capw['end_dt'] = pd.to_datetime(df_capw.end_date).dt.date
df_capw['end_year'] = pd.to_datetime(df_capw.end_dt).dt.year
df_capw['end_month'] = pd.to_datetime(df_capw.end_dt).dt.month
df_capw_all = df_capw.copy()
df_capw = df_capw[(df_capw.classification.isin(['Event','PublicEvent','Sport/Recreation']))]
#Range of years
df_capw.start_year.unique()
array([2018, 2022, 2020, 2019, 2023, 2021], dtype=int64)
#Combine venue data with planned works
clue_venues_capw = pd.concat([clue_venues, df_capw])
#combine values across datasets for year, small area and description
clue_venues_capw['year'] = clue_venues_capw[['census_year', 'start_year']].bfill(axis=1).iloc[:, 0]
clue_venues_capw['year'] = clue_venues_capw['year'].astype(int)
clue_venues_capw['small_area_tag'] = clue_venues_capw[['clue_small_area', 'small_area']].bfill(axis=1).iloc[:, 0]
clue_venues_capw['description_tag'] = clue_venues_capw[['industry_anzsic4_description', 'classification']].bfill(axis=1).iloc[:, 0]
clue_venues_capw['category_tag'] = clue_venues_capw[['category', 'classification']].bfill(axis=1).iloc[:, 0]
#fill remaining nulls
clue_venues_capw.fillna(0, inplace=True)
clue_venues_capw.category_tag.unique()
array(['Café, Restaurant, Bistro', 'Pubs, Taverns and Bars', 'Event'],
dtype=object)
#create data frames per year for some visuals
#the latest data is for the past year
clue_venues_y3=clue_venues_capw.query("year == @y3")
clue_venues_y2=clue_venues_capw.query("year == @y2")
clue_venues_y1=clue_venues_capw.query("year >= @y1") #latest year, see setup for detail
#clue_venues_capw.tail(3).T
The population forecast for the city over the next five years indicates growing demand driven by the increase in population. Pedestrian traffic numbers will be used as an indicator of how many people potentially use the entertainment venues.
Pedestrian traffic shows which areas people visit and at what time of day. This can be used to evaluate whether entertainment venue capacity is low, sufficient, or high.
dsurl = 'city-of-melbourne-population-forecasts-by-small-area-2020-2040'
url = domain + dsurl + baseurl
#print(url)
data_json = requests.get(url).json()
data_json_df = pd.DataFrame.from_dict(data_json)
#this flattens the features
ds=json_normalize(data_json_df['fields'])
ds['year']=ds['year'].astype(int)
#limit to next 5 years
fy = this_year + 5
ds = ds.query("year <= @fy")
#Look at total value for city
ds_smapop_tot = ds.query("age == 'Total population' & geography == 'City of Melbourne'").copy()
ds_smapop_tot.drop(columns=['gender','age'], inplace=True)
ds_pop = ds_smapop_tot.reset_index(drop=True).sort_values(by=['year'], ascending=True)
ds.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5684 entries, 0 to 17045
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   geography  5684 non-null   object
 1   year       5684 non-null   int32
 2   age        5684 non-null   object
 3   value      5663 non-null   float64
 4   gender     5684 non-null   object
dtypes: float64(1), int32(1), object(3)
memory usage: 244.2+ KB
#plot population forecast, next 5 years
fig = px.line(ds_pop, x="year", y="value", title='Population Forecast - City of Melbourne')
fig.show()
#Pedestrian sensor location data
dsurl = 'pedestrian-counting-system-sensor-locations'
url = domain + dsurl + baseurl
#print(url)
data_json = requests.get(url).json()
data_json_df = pd.DataFrame.from_dict(data_json)
#this flattens the features
sensor_data=json_normalize(data_json_df['fields'])
sensor_data[['lat', 'lon']] = sensor_data[['latitude', 'longitude']].astype(float)
#sensor_data.head(5).T
sensor_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   sensor_name         117 non-null    object
 1   direction_1         86 non-null     object
 2   sensor_description  117 non-null    object
 3   direction_2         86 non-null     object
 4   latitude            117 non-null    float64
 5   location            117 non-null    object
 6   sensor_id           117 non-null    int64
 7   installation_date   117 non-null    object
 8   longitude           117 non-null    float64
 9   status              117 non-null    object
 10  note                14 non-null     object
 11  lat                 117 non-null    float64
 12  lon                 117 non-null    float64
dtypes: float64(4), int64(1), object(8)
memory usage: 12.0+ KB
#URL / API method will need to be updated
#Pedestrian foot count data zip file
ds_url = "https://data.melbourne.vic.gov.au/api/datasets/1.0/pedestrian-counting-system-monthly-counts-per-hour/attachments/pedestrian_counting_system_monthly_counts_per_hour_may_2009_to_14_dec_2022_csv_zip/"
filename = 'pedestrian_counting_system_monthly_counts_per_hour_may_2009_to_14_dec_2022.csv'
r = requests.get(ds_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
sensor_traffic = pd.read_csv(filename, sep=',')
#sensor_traffic.info()
#rename columns
sensor_traffic.rename(columns={"Date_Time": "date_time","Year":"year"
,"Month":"month"
,"Mdate":"mdate"
,"Day":"day"
,"Time": "time"
,"Sensor_ID":"sensor_id"
,"Sensor_Name":"sensor_name"
,"Hourly_Counts":"hourly_counts"
}
,inplace = True)
#sensor_traffic.head(5).T
#Add date column
sensor_traffic['date'] = pd.to_datetime(sensor_traffic.date_time).dt.date
sensor_traffic['month_num'] = pd.to_datetime(sensor_traffic.date_time).dt.month
#Add day of week column
sensor_traffic['dow'] = pd.to_datetime(sensor_traffic.date_time).dt.day_of_week
#convert fields to integer
int_columns = ['mdate', 'time', 'year', 'hourly_counts', 'sensor_id']
sensor_traffic[int_columns] = sensor_traffic[int_columns].astype(int)
# Merge pedestrian sensor location and foot traffic datasets
sensor_traffic = pd.merge(sensor_traffic, sensor_data, on='sensor_id', how='inner')
#filter to this decade
sensor_traffic=sensor_traffic.query("year >= @this_decade")
#Add column for day (5am to 5pm) or night (6pm to 4am) traffic
sensor_traffic['day_counts'] = np.where(((sensor_traffic['time']>4) & (sensor_traffic['time']<18)),
sensor_traffic['hourly_counts'] , 0).astype(int)
sensor_traffic['night_counts'] = np.where(sensor_traffic['day_counts']==0,sensor_traffic['hourly_counts']
, 0).astype(int)
sensor_traffic['when'] = np.where((sensor_traffic['day_counts']>0),'day', 'night')
sensor_traffic.rename(columns={"sensor_name_x": "sensor_name"}
,inplace = True)
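The day/night split above relies only on the hour stored in time; a quick sanity check (our own addition, not in the original notebook) confirms that hours 5 to 17 are classed as day and hours 18 to 4 as night:
#Sanity check (our own addition): list which hours fall into each bucket
hours = pd.DataFrame({'time': range(24)})
hours['when'] = np.where((hours['time'] > 4) & (hours['time'] < 18), 'day', 'night')
print(hours.groupby('when')['time'].apply(list))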
sensor_traffic.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4527683 entries, 0 to 4977922
Data columns (total 28 columns):
 #   Column              Dtype
---  ------              -----
 0   ID                  int64
 1   date_time           object
 2   year                int32
 3   month               object
 4   mdate               int32
 5   day                 object
 6   time                int32
 7   sensor_id           int32
 8   sensor_name         object
 9   hourly_counts       int32
 10  date                object
 11  month_num           int64
 12  dow                 int64
 13  sensor_name_y       object
 14  direction_1         object
 15  sensor_description  object
 16  direction_2         object
 17  latitude            float64
 18  location            object
 19  installation_date   object
 20  longitude           float64
 21  status              object
 22  note                object
 23  lat                 float64
 24  lon                 float64
 25  day_counts          int32
 26  night_counts        int32
 27  when                object
dtypes: float64(4), int32(7), int64(3), object(14)
memory usage: 880.9+ MB
sensor_traffic.year.unique()
array([2019, 2014, 2015, 2016, 2017, 2018, 2020, 2021, 2022, 2013, 2012])
#group traffic for the past decade (2012 to 2022) and for the current year
#average day, night and hourly counts per month, year, all areas
sensor_ds = sensor_traffic.query("year >= @this_year") #current year only (this_year set in the defaults cell)
#will use this to show traffic in entertainment locations this year
sensor_ds_yearll = sensor_traffic.groupby(['year','sensor_name','lat','lon'],as_index=False).agg(
{'hourly_counts': 'mean','day_counts':'mean','night_counts':'mean'})
sensor_ds_year = sensor_traffic.groupby(['year'],as_index=False).agg(
{'hourly_counts': 'mean','day_counts':'mean','night_counts':'mean'})
sensor_ds_ym = sensor_ds.groupby(['year','month_num'],as_index=False).agg(
{'hourly_counts': 'mean','day_counts':'mean','night_counts':'mean'})
sensor_ds_ymd = sensor_ds.groupby(['year','month_num', 'dow'],as_index=False).agg(
{'hourly_counts': 'mean','day_counts':'mean','night_counts':'mean'})
sensor_ds_hod = sensor_traffic.groupby(['time'],as_index=False).agg(
{'hourly_counts': 'mean','day_counts':'mean','night_counts':'mean'})
sensor_ds_hod.head(5)
|   | time | hourly_counts | day_counts | night_counts |
|---|---|---|---|---|
| 0 | 0 | 121.358861 | 0.0 | 121.358861 |
| 1 | 1 | 72.797493 | 0.0 | 72.797493 |
| 2 | 2 | 46.606735 | 0.0 | 46.606735 |
| 3 | 3 | 35.748762 | 0.0 | 35.748762 |
| 4 | 4 | 27.686117 | 0.0 | 27.686117 |
Map the number of seats or number of patrons from the CLUE survey responses for venues, including bars, pubs and taverns, and cafes, bistros and restaurants. The capacity measure is the total across these venue types.
#group data for latest survey response
clue_venues_ds = clue_venues_y1.groupby(['census_year', 'clue_small_area','block_id'
,'lon','lat','category_tag','description_tag'],as_index=False).agg(
{'number_of_patrons': 'sum','number_of_seats':'sum','capacity':'sum'})
clue_venues_ds = clue_venues_ds.sort_values(by=['capacity'], ascending=False)
clue_venues_ds.head(5).T
|   | 235 | 1375 | 1580 | 366 | 352 |
|---|---|---|---|---|---|
| census_year | 2021.0 | 2021.0 | 2021.0 | 2021.0 | 2021.0 |
| clue_small_area | Docklands | Melbourne (Remainder) | Southbank | Kensington | East Melbourne |
| block_id | 1103.0 | 659.0 | 803.0 | 570.0 | 658.0 |
| lon | 144.94756 | 144.97904 | 144.95833 | 144.9127 | 144.98319 |
| lat | -37.81674 | -37.82134 | -37.82316 | -37.78907 | -37.82001 |
| category_tag | Pubs, Taverns and Bars | Café, Restaurant, Bistro | Café, Restaurant, Bistro | Café, Restaurant, Bistro | Café, Restaurant, Bistro |
| description_tag | 0 | Catering Services | Cafes and Restaurants | Horse and Dog Racing Administration and Track ... | Cafes and Restaurants |
| number_of_patrons | 5500.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| number_of_seats | 0.0 | 4970.0 | 4151.0 | 3660.0 | 3415.0 |
| capacity | 5500.0 | 4970.0 | 4151.0 | 3660.0 | 3415.0 |
# Display the choropleth map
fig = px.choropleth_mapbox(
clue_venues_ds, #dataset
geojson=clueblocks, #CLUE Block spatial data
locations='block_id',
color='capacity',
color_continuous_scale='sunset', #colour scale ylgn / sunset / geyser
range_color=(0, df_btp_capacity['number_of_patrons'].max()), #range for the colour scale
featureidkey="properties.block_id",
mapbox_style="carto-positron", #map style
zoom=11.75, #zoom level
center = {"lat": -37.81216592937499, "lon": 144.961812290625}, # set the map centre coordinates on Melbourne
opacity=0.7,
hover_name='clue_small_area', #title of the pop up box
hover_data={'census_year':True, 'block_id':True, 'number_of_patrons':True,
'number_of_seats':True, 'capacity':True, 'description_tag': True,
'lon':False, 'lat':False, 'category_tag':True
}, #values to display in the popup box
labels={'number_of_patrons':'Number of Patrons','block_id':'Block Id',
'number_of_seats':'Number of Seats',
'capacity':'Capacity','census_year':'Census Year',
'category_tag':'Category','description_tag':'Description'
},
title='Venue Capacity', #Title for plot
width=950, height=800 #dimensions of plot in pixels
)
#show year 3
fig3 = px.scatter_mapbox(
clue_venues_y3, lat="lat", lon="lon",
opacity=0.8,
hover_name='clue_small_area', # the title of the hover pop up box
hover_data={'census_year':True,'block_id':True,'number_of_patrons':True,
'number_of_seats':True, 'capacity' : True, 'description_tag': True,
'lat':False,'lon':False}, #values to display in the popup box
color_discrete_sequence=['blue'],
labels={'capacity':'Capacity','block_id':'Block Id','description_tag':'Description',
'census_year':'Census Year', 'number_of_patrons': 'Number of Patrons',
'number_of_seats':'Number of Seats'
}, #labels
)
#show year 2
fig2 = px.scatter_mapbox(
clue_venues_y2, lat="lat", lon="lon",
opacity=0.7,
hover_name='clue_small_area', # the title of the hover pop up box
hover_data={'census_year':True,'block_id':True,'number_of_patrons':True,
'number_of_seats':True, 'capacity' : True, 'description_tag': True,
'lat':False,'lon':False}, #values to display in the popup box
color_discrete_sequence=['cyan'],
labels={'capacity':'Capacity','block_id':'Block Id','description_tag':'Description',
'census_year':'Census Year', 'number_of_patrons': 'Number of Patrons',
'number_of_seats':'Number of Seats'
}, #labels
)
#show year 1
fig1 = px.scatter_mapbox(
clue_venues_y1, lat="lat", lon="lon",
opacity=0.75,
hover_name='clue_small_area', #title of the pop up box
hover_data={'census_year':True,'block_id':True,'number_of_patrons':True,
'number_of_seats':True, 'capacity' : True, 'description_tag': True,
'lat':False,'lon':False}, #values to display in the popup box
color_discrete_sequence=['purple'],
size_max=20, zoom=10,
labels={'capacity':'Capacity','block_id':'Block Id','description_tag':'Description',
'census_year':'Census Year', 'number_of_patrons': 'Number of Patrons',
'number_of_seats':'Number of Seats'
}, #labels
)
#differentiate recent years for interest
fig.add_trace(fig3.data[0])
fig.add_trace(fig2.data[0])
fig.add_trace(fig1.data[0])
fig.update_geos(fitbounds="locations", visible=True)
fig.show()
From the map above, looking at small area blocks by capacity, we can see which blocks concentrate the highest venue capacity.
Assess venue capacity, combining both number of seats and number of patrons, based on CLUE survey responses for 2021.
From the chart below we can see that West Melbourne has no entertainment venues in the 2021 responses, although there were responses for this area in previous years.
The highest numbers of entertainment venues are in Melbourne (CBD), Southbank, Docklands, Carlton and Melbourne (Remainder).
#group
df = clue_venues_y1.groupby(['small_area_tag','category_tag','seating_type'],as_index=False).agg(
{'capacity':'sum'})
#filter values
df = df.query("capacity > 0")
#sort
df= df.sort_values(by=['capacity'], ascending=True)
#plot
fig = px.bar(df, y='small_area_tag', x='capacity', hover_data=["seating_type"],
color='category_tag', title="Venue Capacity")
fig.show()
sensor_ds_yearll.head()
|   | year | sensor_name | lat | lon | hourly_counts | day_counts | night_counts |
|---|---|---|---|---|---|---|---|
| 0 | 2012 | Australia on Collins | -37.815734 | 144.965210 | 711.437158 | 615.525159 | 95.911999 |
| 1 | 2012 | Birrarung Marr | -37.818629 | 144.971694 | 345.953893 | 229.489071 | 116.464822 |
| 2 | 2012 | Bourke Street Mall (North) | -37.813494 | 144.965153 | 1059.452983 | 850.250569 | 209.202413 |
| 3 | 2012 | Bourke Street Mall (South) | -37.813807 | 144.965167 | 1146.758652 | 962.091985 | 184.666667 |
| 4 | 2012 | Collins Place (North) | -37.813449 | 144.973054 | 341.213115 | 289.785291 | 51.427823 |
#Create a map object centered on Melbourne
map = folium.Map(location=[-37.81216592937499,
144.961812290625],
zoom_start=20)
outline1 = {'fillColor': 'orange', 'color': 'teal'}
outline2 = {'fillColor': 'lightorange', 'color': 'lightgreen'}
#Add all sensor locations
location_data = sensor_ds_yearll
for _, row in location_data.iterrows():
    folium.Marker(
        icon=folium.Icon(color="purple", icon="info-sign"),
        location=[row['lat'], row['lon']],
        tooltip=row['sensor_name'],
        popup=f"<h3>Pedestrian Traffic</h3><b>Day</b> {row['day_counts']:.1f} <b>Night</b> {row['night_counts']:.1f}"
    ).add_to(map)
map
def summarize_within(input_gdf, input_summary_gdf, in_fields, out_fields=None, aggfunc='mean'):
    '''
    Overlays a polygon layer with another layer to calculate attribute field statistics
    about the features (input_summary_gdf) that fall within the polygons (input_gdf).
    Parameters:
        input_gdf: GeoDataFrame of the polygons by which features will be summarized.
        input_summary_gdf: GeoDataFrame of the features to summarize.
        in_fields: names of the fields (in input_summary_gdf) to summarize.
        out_fields: names the fields will have after they are summarized.
        aggfunc: aggregation function used to summarize.
    Returns:
        A GeoDataFrame with the 'input_gdf' polygons and the attributes of
        'input_summary_gdf' summarized per polygon.
    '''
    input_gdf = input_gdf.copy()
    input_summary_gdf = input_summary_gdf.copy()
    print(input_summary_gdf.columns)
    if out_fields is None:
        out_fields = in_fields
    #Spatially join the summary features to the polygons; the new "index_right" column records which polygon each feature falls within
    merged = gpd.sjoin(input_summary_gdf, input_gdf, how='left')
    #Aggregate the joined features per polygon (dissolve acts like a spatial groupby)
    dissolve = merged.dissolve(by="index_right", aggfunc=aggfunc)
    for in_field, out_field in zip(in_fields, out_fields):
        input_gdf.loc[dissolve.index, out_field] = dissolve[in_field].values #write the aggregated values back onto the polygons
    return input_gdf.round(2)
#convert planned works to a GeoDataFrame (the geometry column holds WKT strings)
gdf_capw = gpd.GeoDataFrame(df_capw_all, geometry=df_capw_all['geometry'].apply(loads))
#venue points for the latest survey year (2020)
df_crb_y3 = df_crb[df_crb['census_year'] == 2020]
df_crb_y3_gdf = gpd.GeoDataFrame(df_crb_y3, geometry=gpd.points_from_xy(df_crb_y3['longitude'], df_crb_y3['latitude']))
gdf_capw['Number of seats'] = summarize_within(gdf_capw, df_crb_y3_gdf, ['number_of_seats'], aggfunc='sum').dropna(subset=['number_of_seats'])['number_of_seats']
gdf_capw['Number of seats'] = gdf_capw['Number of seats'].fillna(0)
df_btp_capacity_y3 = df_btp_capacity[df_btp_capacity['census_year'] == 2020]
df_btp_capacity_y3_gdf = gpd.GeoDataFrame(df_btp_capacity_y3, geometry=gpd.points_from_xy(df_btp_capacity_y3['longitude'], df_btp_capacity_y3['latitude']))
gdf_capw['Number of patrons'] = summarize_within(gdf_capw, df_btp_capacity_y3_gdf, ['number_of_patrons'], aggfunc='sum').dropna(subset=['number_of_patrons'])['number_of_patrons']
gdf_capw['Number of patrons'] = gdf_capw['Number of patrons'].fillna(0)
Index(['location', 'seating_type', 'census_year', 'property_id',
'base_property_id', 'trading_name', 'block_id',
'industry_anzsic4_description', 'number_of_seats', 'building_address',
'clue_small_area', 'business_address', 'industry_anzsic4_code',
'longitude', 'latitude', 'category', 'geometry'],
dtype='object')
Index(['longitude', 'census_year', 'building_address', 'trading_name',
'location', 'property_id', 'business_address', 'latitude',
'clue_small_area', 'block_id', 'number_of_patrons', 'base_property_id',
'category', 'geometry'],
dtype='object')
def plot_map(gdf, col1, col2, col3, title):
fig = go.Figure(go.Choroplethmapbox(geojson=gdf.__geo_interface__, locations=gdf.index, z=gdf[col1],
colorscale="YlOrRd", zmin=gdf[col1].min(), zmax=gdf[col1].max(),
marker_opacity=1, marker_line_width=0, ))
x,y = box(*gdf.total_bounds).centroid.xy
print(gdf[col1].max())
fig.update_layout(mapbox_style="carto-positron", mapbox_center = {"lat": y[0], "lon": x[0]}, mapbox_zoom=12)
matter_r= [[0.0, '#2f0f3d'], #cmocean colorscale
[0.1, '#4f1552'],
[0.2, '#72195f'],
[0.3, '#931f63'],
[0.4, '#b32e5e'],
[0.5, '#cf4456'],
[0.6, '#e26152'],
[0.7, '#ee845d'],
[0.8, '#f5a672'],
[0.9, '#faca8f'],
[1.0, '#fdedb0']]
button1 = dict(method= 'update',
label=col1,
args=[
{"z": [gdf[col1]],
"zmax":[gdf[col1].max()],
"zmin":[gdf[col1].min()]
}, #dict for fig.data[0] updates
{"coloraxis.colorscale":"Viridis" } #dict for layout attribute update
])
button2 = dict(method= 'update',
label=col2,
args=[
{"z": [gdf[col2]],
"zmax":[gdf[col2].max()],
"zmin":[gdf[col2].min()]
},
{"coloraxis.colorscale": matter_r} #update layout attribute
])
button3 = dict(method= 'update',
label=col3,
args=[
{"z": [gdf[col3]],
"zmax":[gdf[col3].max()],
"zmin":[gdf[col3].min()]
},
{"coloraxis.colorscale": matter_r} #update layout attribute
])
fig.update_layout(updatemenus=[dict(active=0,
buttons= [button1, button2, button3])]
)
fig.update_layout(title_text = title, title_x=0.5)
return fig
#gdf_capw = gpd.GeoDataFrame(df_capw, geometry='geometry')
plot_map(gdf_capw, 'end_year', 'Number of seats', 'Number of patrons', 'Planned activity and works')
2028
results_seats = []
#compute the union of all planned works geometries once, outside the loop
apw_union = unary_union(gdf_capw['geometry'])
for small_area in tqdm(df_crb_y3_gdf['clue_small_area'].unique()):
    area_gdf = df_crb_y3_gdf[df_crb_y3_gdf['clue_small_area'] == small_area]
    intersection_mask = area_gdf.intersects(apw_union)
    seats_within_apw = area_gdf[intersection_mask]['number_of_seats'].sum()
    total_seats = area_gdf['number_of_seats'].sum()
    results_seats.append({"clue_small_area": small_area, 'number_of_seats': seats_within_apw,
                          'percentage_of_seats': (seats_within_apw / total_seats) * 100})
results_seats = pd.DataFrame(results_seats)
100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:27<00:00, 2.12s/it]
fig = go.Figure()
fig.add_trace(
go.Bar(x =results_seats['clue_small_area'], y=results_seats['number_of_seats'])
)
fig.update_layout(title_text = 'Number of seats of businesses located within planned activity and works area', title_x=0.5)
fig.show()
fig = go.Figure()
fig.add_trace(
go.Bar(x =results_seats['clue_small_area'], y=results_seats['percentage_of_seats'])
)
fig.update_layout(title_text = 'Percentage of seats located within planned activity and works area', title_x=0.5)
fig.show()
results_patrons = []
#reuse the union of planned works geometries computed above
for small_area in tqdm(df_btp_capacity_y3_gdf['clue_small_area'].unique()):
    area_gdf = df_btp_capacity_y3_gdf[df_btp_capacity_y3_gdf['clue_small_area'] == small_area]
    intersection_mask = area_gdf.intersects(apw_union)
    patrons_within_apw = area_gdf[intersection_mask]['number_of_patrons'].sum()
    total_patrons = area_gdf['number_of_patrons'].sum()
    results_patrons.append({"clue_small_area": small_area, 'number_of_patrons': patrons_within_apw,
                            'percentage_of_patrons': (patrons_within_apw / total_patrons) * 100})
results_patrons = pd.DataFrame(results_patrons)
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00, 3.05it/s]
fig = go.Figure()
fig.add_trace(
go.Bar(x =results_patrons['clue_small_area'], y=results_patrons['number_of_patrons'])
)
fig.update_layout(title_text = 'Patron capacity of businesses located within planned activity and works area', title_x=0.5)
fig.show()
fig = go.Figure()
fig.add_trace(
go.Bar(x =results_patrons['clue_small_area'], y=results_patrons['percentage_of_patrons'])
)
fig.update_layout(title_text = 'Percentage of patrons capacity located within planned activity and works area', title_x=0.5)
fig.show()
In this section we project the return of pedestrian traffic to the city: if pedestrian traffic increases, it indicates people returning. The time of day and the location of that activity also indicate whether the interaction is associated with work or with leisure.
Specifically, we can associate evening and night-time activity with entertainment, either at venues such as bars, pubs, taverns, and restaurants, or at events. Alongside night-time traffic, we will use the small area population forecast and venue capacity to project growth areas.
df = sensor_ds_year
#exclude outlier years 2020 to 2022 - Covid impact
out_values = [2020, 2021, 2022]
df = df[~df.year.isin(out_values)]
#print(df.year.unique())
fig = px.scatter(df, x="year", y=["hourly_counts","day_counts","night_counts"], trendline="ols")
fig.show()
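The scatter above fits OLS trendlines through plotly; to get numeric projections for the coming years, a short statsmodels sketch (our own addition, reusing the sensor_ds_year frame and the this_year value from the setup cell) could look like this:
#Hypothetical projection sketch (not in the original notebook): fit an OLS trend to
#pre-Covid yearly night-time averages and extrapolate five years forward.
import statsmodels.api as sm
df_proj = sensor_ds_year[~sensor_ds_year.year.isin([2020, 2021, 2022])] #exclude Covid-affected years
X_trend = sm.add_constant(df_proj['year'])
ols_fit = sm.OLS(df_proj['night_counts'], X_trend).fit()
future = pd.DataFrame({'year': range(this_year, this_year + 5)})
future['projected_night_counts'] = ols_fit.predict(sm.add_constant(future['year'])).round(1)
print(future)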
sensor_ds_hod
|   | time | hourly_counts | day_counts | night_counts |
|---|---|---|---|---|
| 0 | 0 | 121.358861 | 0.000000 | 121.358861 |
| 1 | 1 | 72.797493 | 0.000000 | 72.797493 |
| 2 | 2 | 46.606735 | 0.000000 | 46.606735 |
| 3 | 3 | 35.748762 | 0.000000 | 35.748762 |
| 4 | 4 | 27.686117 | 0.000000 | 27.686117 |
| 5 | 5 | 45.943161 | 45.943161 | 0.000000 |
| 6 | 6 | 130.111327 | 130.111327 | 0.000000 |
| 7 | 7 | 312.831976 | 312.831976 | 0.000000 |
| 8 | 8 | 624.821551 | 624.821551 | 0.000000 |
| 9 | 9 | 504.827384 | 504.827384 | 0.000000 |
| 10 | 10 | 526.240045 | 526.240045 | 0.000000 |
| 11 | 11 | 653.332586 | 653.332586 | 0.000000 |
| 12 | 12 | 948.004744 | 948.004744 | 0.000000 |
| 13 | 13 | 991.708486 | 991.708486 | 0.000000 |
| 14 | 14 | 862.253786 | 862.253786 | 0.000000 |
| 15 | 15 | 862.161487 | 862.161487 | 0.000000 |
| 16 | 16 | 934.410323 | 934.410323 | 0.000000 |
| 17 | 17 | 1098.771602 | 1098.771602 | 0.000000 |
| 18 | 18 | 857.967536 | 0.000000 | 857.967536 |
| 19 | 19 | 631.734833 | 0.000000 | 631.734833 |
| 20 | 20 | 505.023591 | 0.000000 | 505.023591 |
| 21 | 21 | 429.567774 | 0.000000 | 429.567774 |
| 22 | 22 | 350.414917 | 0.000000 | 350.414917 |
| 23 | 23 | 222.786239 | 0.000000 | 222.786239 |
df = sensor_ds_hod
fig = px.scatter(df, x="time", y=["hourly_counts","day_counts","night_counts"], trendline="ols")
fig.show()
The hour-of-day profile shows traffic building through the morning and peaking around 5pm, with night-time counts highest in the early evening (6pm to 8pm) and tapering towards the early morning. This early-evening concentration supports using night-time traffic as an indicator of entertainment activity.
#Model for forecasting location growth using datasets loaded earlier
#Todo:
#Initial linear regression, compare to another model such as an LSTM
#Techniques for controlling jittering
#Normalise data to a 28 day period per month, e.g. 28/31 * pedestrian count
#look at areas with high demand based on pedestrian traffic by month, dow and hod
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#NOTE: merged_final is assumed here to be a numeric feature matrix built from the
#sensor traffic data prepared earlier; the construction below is one example.
merged_final = sensor_traffic[['year', 'month_num', 'dow', 'time', 'sensor_id', 'hourly_counts']].dropna()
X = merged_final.drop(columns='hourly_counts') #features
y = merged_final.hourly_counts #target: the actual pedestrian counts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
#And then do the regression.
LR = LinearRegression()
LR.fit(X_train, y_train)
print("The R-squared score is: ", LR.score(X_test, y_test))
#Linear Regression
LR = LinearRegression(fit_intercept=False)
LR.fit(X_train, y_train)
print("The basic Linear Regression R-squared score: ", LR.score(X_test, y_test))
#Decision Tree Regressor
DT = DecisionTreeRegressor(max_depth = 75)
DT.fit(X_train, y_train)
print("The Decision Tree regressor's R-squared score: ", DT.score(X_test, y_test))
#Random Forest Regressor (max_features reduced from the original 100 to suit the smaller assumed feature matrix)
RFR = RandomForestRegressor(n_estimators=150, max_depth=100, n_jobs=-1)
RFR.fit(X_train, y_train)
print("The Random Forest regressor's R-squared score: ", RFR.score(X_test, y_test))
City of Melbourne Open Data Team, 2016-2022, 'Bar, tavern, pub patron capacity 2020', City of Melbourne, date retrieved 26 Nov 2022, https://data.melbourne.vic.gov.au/explore/dataset/bars-and-pubs-with-patron-capacity/information/
City of Melbourne Open Data Team, 2015-2022, 'Cafe, restaurant, bistro seats 2020', City of Melbourne, date retrieved 26 Nov 2022, https://data.melbourne.vic.gov.au/explore/dataset/cafes-and-restaurants-with-seating-capacity/information/
City of Melbourne Open Data Team, 2021-2022, 'City Activities and Planned Works', City of Melbourne, date retrieved 26 Nov 2022, https://data.melbourne.vic.gov.au/explore/dataset/city-activities-and-planned-works/information/?disjunctive.classification&disjunctive.small_area
City of Melbourne Open Data Team, 2014-2021, 'Pedestrian Counting System - Monthly (counts per hour)', City of Melbourne, date retrieved 03 Dec 2022, https://melbournetestbed.opendatasoft.com/explore/dataset/pedestrian-counting-system-monthly-counts-per-hour/information/
City of Melbourne Open Data Team, 2018-2021, 'Pedestrian Counting System - Sensor Locations', City of Melbourne, date retrieved 03 Dec 2022, https://data.melbourne.vic.gov.au/explore/dataset/pedestrian-counting-system-sensor-locations/information/
City of Melbourne Open Data Team, 2021-2022, 'City of Melbourne Population Forecasts by Small Area 2021-2041', City of Melbourne, date retrieved 15 Dec 2022, https://data.melbourne.vic.gov.au/explore/dataset/city-of-melbourne-population-forecasts-by-small-area-2020-2040/information/